# Imports
import numpy as np
import pandas as pd
from IPython.core.display import HTML
from typing import Dict, List
import plotly.express as px
#_COLOR_SCHEME = ["#adaee3", "#acbfef", "#a6d2f8", "#94e1ff",
# "#8df1ff", "#b3deb5", "#cee6b3", "#e3eb90",
# "#fff7ad", "#ffe597", "#ffd494", "#ffb8a2",
# "#ffbcbc", "#ffc8dc", "#e8bbf0", "#dfb2ff",
# "#cccccc"]
_COLOR_SCHEME = ["#087f8c", "#2d4059", "#ef476f", "#ffd166",
"#06d6a0", "#118ab2", "#fcbf49", "#d90429"]
_CALLOUT_PREFIX = "<div class='callout green-callout'><h4 style='font-family: verdana; color: #444444; font-size:110%;'>"
# CSS Style
# CSS Style
def css_styling():
    """Build an HTML <style> element that themes the notebook's callout
    boxes (box model reset, rounded shadowed .callout, green accent)."""
    rules = """
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
.callout {
width: 80%;
margin: 20px auto;
padding: 30px;
position: relative;
border-radius: 5px;
box-shadow: 0 0 15px 5px rgb(255, 255, 255, 0.65);
}
.green-callout {
background-color: #d3efdc;
border-left: 5px solid #9dc2a9;
border-right: 5px solid #9dc2a9;
}
"""
    return HTML(f"<style>{rules}</style>")
# Apply the notebook CSS theme defined above.
css_styling()
# NOTE(review): the original cell contained the IPython magic `pwd` and its
# echoed output ('C:\\Users\\vspur'). `pwd` raises NameError when the file is
# run as a plain Python script and the echoed string was a no-op, so both
# notebook artifacts were removed.
# Global variables
_WIDTH = 900  # width in pixels of every plotly figure below
_YEARS = [2018, 2019, 2020, 2021, 2022]
#/Users/sairamgajavalli/Desktop/advance_ba
# NOTE(review): _FILEPATHS has four entries while _YEARS lists five years;
# read_data_from_all_years zips them together, so 2022 is silently skipped.
# Paths are machine-specific — confirm before running on another machine.
_FILEPATHS = ["C:\\Users\\vspur\\multipleChoiceResponses_2018.csv",
              "C:\\Users\\vspur\\multiple_choice_responses_2019.csv",
              "C:\\Users\\vspur\\kaggle_survey_2020_responses.csv",
              "C:\\Users\\vspur\\kaggle_survey_2021_responses.csv"]
#"/kaggle/input/kaggle-survey-2022/kaggle_survey_2022_responses.csv"
# Countries kept in the analysis (survey spelling of the US is the long form).
_COUNTRIES = [ "United States of America", "China", "India"]
"""
csv reading and data proecessing functions to select only
respondents that are employees and that work at the selected countries
"""
# Data reading and cleaning functions
#def keep_employees_in_2022df(df: pd.DataFrame) -> pd.DataFrame:
# df = df[df["Q5"] == "No"]
# df = df[~df["Q23"].str.contains(
# "not employed", case=False, na=False)]
# return df
def keep_employees_in_2019_to_2021df(df: pd.DataFrame) -> pd.DataFrame:
df = df[~df["Q5"].str.contains(
"Student", case=False, na=False)]
df = df[~df["Q5"].str.contains(
"employed", case=False, na=False)]
return df
def keep_employees_in_2018df(df: pd.DataFrame) -> pd.DataFrame:
    """Drop 2018 respondents whose Q6 role marks them as a student or as
    (not) employed, keeping only working respondents."""
    is_student = df["Q6"].str.contains("Student", case=False, na=False)
    df = df[~is_student]
    mentions_employed = df["Q6"].str.contains("employed", case=False, na=False)
    return df[~mentions_employed]
def keep_employees_in_df(df: pd.DataFrame, year: int) -> pd.DataFrame:
    """ Keeps only the data of respondents that are employees/retirees.

    Dispatches to a year-specific filter because the survey renamed its
    columns every year.

    Raises:
        ValueError: if `year` is not a supported survey year (the original
            silently returned None here).
    """
    if year == 2022:
        # Inlined: the former keep_employees_in_2022df helper was commented
        # out at module level, so this branch previously raised NameError.
        df = df[df["Q5"] == "No"]
        return df[~df["Q23"].str.contains(
            "not employed", case=False, na=False)]
    elif year in [2021, 2020, 2019]:
        return keep_employees_in_2019_to_2021df(df)
    elif year == 2018:
        return keep_employees_in_2018df(df)
    elif year == 2017:
        df = df[~(df["Student Status"] == "Yes")]
        return df[~df["EmploymentStatus"].str.contains(
            "Not employed", case=False, na=False)]
    raise ValueError(f"Unsupported survey year: {year}")
def filter_country_in_df(
        df: pd.DataFrame, year: int, country: str) -> pd.DataFrame:
    """ Keep only the rows of respondents living in `country`; the column
    holding the country answer differs per survey year."""
    if year == 2022:
        column = "Q4"
    elif 2018 <= year <= 2021:
        column = "Q3"
    elif year == 2017:
        column = "Country"
    else:
        return None  # mirrors the original's implicit None for unknown years
    return df[df[column] == country]
def read_year_country_data(
        filepath: str, year: int, country: str) -> pd.DataFrame:
    """ Load one survey CSV (all columns as strings) and keep only the
    employee/retiree respondents from `country`."""
    raw = pd.read_csv(filepath, dtype=str)
    in_country = filter_country_in_df(raw, year, country)
    return keep_employees_in_df(in_country, year)
def read_data_from_all_years(
        years: List[int],
        filepaths: List[str],
        countries: List[str]) -> Dict[str, Dict[int, pd.DataFrame]]:
    """ Return the survey data of employee/retiree respondents, keyed first
    by country and then by year. Note: zip() truncates to the shorter of
    `years`/`filepaths`."""
    data_by_country: Dict[str, Dict[int, pd.DataFrame]] = {}
    for country in countries:
        per_year: Dict[int, pd.DataFrame] = {}
        for year, path in zip(years, filepaths):
            year_df = read_year_country_data(path, year, country)
            assert year_df is not None
            per_year[year] = year_df
        data_by_country[country] = per_year
    return data_by_country
"""
Data processing functions to get one analysis_df with all the data
used in our analysis with labels fixed
"""
# https://www.economist.com/big-mac-index
# https://github.com/TheEconomist/big-mac-data
# Price of a Big Mac in USD per country and year (The Economist's Big Mac
# index, links above). Salaries are divided by these prices so compensation
# can be compared across countries in "number of Big Macs".
bigmac_price = {
    "Japan": {
        2022: 4.33424940302147,
        2021: 4.52475306799529,
        2020: 4.4222753559572,
        2019: 4.26944490282339,
        2018: 4.34051401396391,
    },
    "United States of America": {
        2022: 4.54640118103503,
        2021: 4.75935662467434,
        2020: 4.71797911563537,
        2019: 4.51734673325535,
        2018: 4.57758988396032,
    },
    "China": {
        2022: 3.64104793756501,
        2021: 3.53487733533526,
        2020: 3.30759148628284,
        2019: 3.28136332239687,
        2018: 3.23090135161334,
    },
    "India": {
        2022: 3.42355999236184,
        2021: 3.21614517431256,
        2020: 2.96866603376469,
        2019: 2.99855627219012,
        2018: 2.94345521926975,
    },
}
def get_age_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the age column, with the oldest brackets ("80+", "70-79")
    merged into a single "70+" bucket."""
    if year in (2022, 2018):
        ages = df["Q2"]
    elif year in (2021, 2020, 2019):
        ages = df["Q1"]
    ages = ages.replace("80+", "70+")
    return ages.replace("70-79", "70+")
def get_gender_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the gender column, normalised to the 2022 wording
    ("Male" -> "Man", "Female" -> "Woman")."""
    if year == 2022:
        column = "Q3"
    elif year in (2021, 2020, 2019):
        column = "Q2"
    elif year == 2018:
        column = "Q1"
    else:
        return None  # mirrors the original's implicit None fallthrough
    genders = df[column].replace("Male", "Man")
    return genders.replace("Female", "Woman")
def convert_salary_to_num_bigmacs(salary_data: List[float], year: int, country: str) -> List[float]:
    """ Express salaries as the whole number of Big Macs they buy in the
    given country and year (floor division by the local Big Mac price)."""
    price = bigmac_price[country][year]
    return salary_data // price
def convert_str_range_to_median(str_range: str) -> float:
    """ Convert a salary-range string (e.g. "10,000-20,000") to the midpoint
    of the range; a plain number becomes a float; NaN passes through."""
    if pd.isnull(str_range):
        return str_range
    cleaned = str_range
    for token in (",", "$", ">", "+", " "):
        cleaned = cleaned.replace(token, "")
    if "-" in cleaned:
        low, _, high = cleaned.partition("-")
        return (float(high) + float(low)) / 2
    return float(cleaned)
def process_2018_salary_ranges(str_range: str) -> str:
    """ Rewrite a 2018 salary range so its lower bound is in dollars:
    "10-20,000" -> "10,000-20,000". Non-ranges and NaN pass through."""
    if pd.isnull(str_range) or "-" not in str_range:
        return str_range
    low, _, high = str_range.partition("-")
    return f"{low},000-{high}"
def process_salary_data(salary_data: pd.Series, year: int, country: str) -> List[float]:
    """ Turn raw salary-range strings into Big Mac counts for one
    country/year."""
    if year == 2018:
        # 2018 ranges lack the ",000" on the lower bound; normalise first.
        salary_data = salary_data.apply(process_2018_salary_ranges)
    medians = salary_data.apply(convert_str_range_to_median)
    return convert_salary_to_num_bigmacs(medians, year, country)
def get_salary_data(df: pd.DataFrame, year: int, country: str) -> List[float]:
    """ Return the salary column converted to Big Mac counts; undisclosed
    salaries become NaN."""
    column_by_year = {
        2022: "Q29",
        2021: "Q25",
        2020: "Q24",
        2019: "Q10",
        2018: "Q9",
    }
    assert year in column_by_year
    salaries = df[column_by_year[year]].replace(
        "I do not wish to disclose my approximate yearly compensation", np.nan)
    return process_salary_data(salaries, year, country)
def get_title_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the job-title column with the long parenthesised labels
    shortened for plotting."""
    if year == 2022:
        titles = df["Q23"]
    elif year in (2021, 2020, 2019):
        titles = df["Q5"]
    elif year == 2018:
        titles = df["Q6"]
    titles = titles.replace(
        "Data Analyst (Business, Marketing, Financial, Quantitative, etc)",
        "Data Analyst")
    return titles.replace(
        "Manager (Program, Project, Operations, Executive-level, etc)",
        "Manager")
def get_formal_education_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the formal-education column with labels normalised: straight
    apostrophes, shortened doctorate label, and a <br> inserted into the
    long "some college" option for plot legends."""
    column = "Q8" if year == 2022 else "Q4"
    relabel = {
        "Professional doctorate": "Professional degree",
        "Some college/university study without earning a bachelor’s degree":
            "Some college/university study without<br>earning a bachelor's degree",
        "Bachelor’s degree": "Bachelor's degree",
        "Master’s degree": "Master's degree",
    }
    return df[column].replace(relabel)
def get_multiple_answers_data(df: pd.DataFrame, data_select: dict, year: int) -> pd.Series:
    """ Concatenate all answers of one multiple-choice question into a single
    string per respondent.

    Selects every column belonging to question data_select[year] — the bare
    column plus any "_"-suffixed sub-columns (e.g. "Q6_Part_1") — and joins
    each row's non-null answers with " -- ".

    Note: the original used a substring test (`key in column`), which could
    also pick up unrelated questions (e.g. "Q1" matching "Q10_Part_1"); the
    exact-or-prefix test below cannot.
    """
    question = data_select[year]
    columns_to_select = [
        column for column in df.columns
        if column == question or column.startswith(question + "_")
    ]
    return df[columns_to_select].apply(
        lambda row: ' -- '.join(row.dropna().values.astype(str)), axis=1)
def get_learning_platforms_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined multiple-choice answers about learning platforms."""
    question_by_year = {2022: "Q6", 2021: "Q40", 2020: "Q37",
                        2019: "Q13", 2018: "Q36"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_coding_experience_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return coding experience in years, with each survey's buckets mapped
    onto a common set ("I have never written code", "< 1 year", "1-5 years",
    ..., "20+ years")."""
    if year == 2022:
        experience = df["Q11"]
    elif year in (2021, 2020):
        experience = df["Q6"]
    elif year == 2019:
        experience = df["Q15"]
    else:
        experience = df["Q24"]
    # Exact-value relabelling; no mapped value is itself a key, so a single
    # dict pass is equivalent to the original chain of replace() calls.
    harmonise = {
        "I have never written code and I do not want to learn": "I have never written code",
        "I have never written code but I want to learn": "I have never written code",
        "20-30 years": "20+ years",
        "30-40 years": "20+ years",
        "40+ years": "20+ years",
        "1-3 years": "1-5 years",
        "3-5 years": "1-5 years",
        "1-2 years": "1-5 years",
        "2-5 years": "1-5 years",
        "< 1 years": "< 1 year",
    }
    return experience.replace(harmonise)
def get_ML_experience_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return ML experience in years, harmonised across survey years
    (common buckets, "never studied" folded into the "do not use" label)."""
    if year == 2022:
        experience = df["Q16"]
    elif year in (2021, 2020):
        experience = df["Q15"]
    elif year == 2019:
        experience = df["Q23"].replace("10-15 years", "10-20 years")
    else:
        experience = df["Q25"].replace("10-15 years", "10-20 years")
    # Exact-value relabelling; no mapped value is itself a key, so one dict
    # pass matches the original chained replace() calls.
    harmonise = {
        "I have never studied machine learning but plan to learn in the future": "I do not use machine<br>learning methods",
        "I have never studied machine learning and I do not plan to": "I do not use machine<br>learning methods",
        "< 1 years": "< 1 year",
        "Under 1 year": "< 1 year",
        "20 or more years": "20+ years",
        "I do not use machine learning methods": "I do not use machine<br>learning methods",
    }
    return experience.replace(harmonise)
def get_important_activities_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about important workplace activities."""
    question_by_year = {2022: "Q28", 2021: "Q24", 2020: "Q23",
                        2019: "Q9", 2018: "Q11"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_language_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about programming languages used."""
    question_by_year = {2022: "Q12", 2021: "Q7", 2020: "Q7",
                        2019: "Q18", 2018: "Q16"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_IDE_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about IDEs used."""
    question_by_year = {2022: "Q13", 2021: "Q9", 2020: "Q9",
                        2019: "Q16", 2018: "Q13"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_notebook_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about notebook products used."""
    question_by_year = {2022: "Q14", 2021: "Q10", 2020: "Q10",
                        2019: "Q17", 2018: "Q14"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_visualization_tools_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about visualization tools used."""
    question_by_year = {2022: "Q15", 2021: "Q14", 2020: "Q14",
                        2019: "Q20", 2018: "Q21"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_ML_frameworks_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about ML frameworks used."""
    question_by_year = {2022: "Q17", 2021: "Q16", 2020: "Q16",
                        2019: "Q28", 2018: "Q19"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_ML_algorithms_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about ML algorithms used (not asked
    in 2018)."""
    question_by_year = {2022: "Q18", 2021: "Q17", 2020: "Q17", 2019: "Q24"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_computer_vision_methods_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about computer-vision methods used (not
    asked in 2018)."""
    question_by_year = {2022: "Q19", 2021: "Q18", 2020: "Q18", 2019: "Q26"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_NLP_methods_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about NLP methods used (not asked
    in 2018)."""
    question_by_year = {2022: "Q20", 2021: "Q19", 2020: "Q19", 2019: "Q27"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_ML_hardware_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about ML hardware used (not asked
    in 2018)."""
    question_by_year = {2022: "Q42", 2021: "Q12", 2020: "Q12", 2019: "Q21"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_TPU_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return TPU usage frequency with the bucket labels unified across
    years (not asked in 2018)."""
    question_by_year = {2022: "Q43", 2021: "Q13", 2020: "Q13", 2019: "Q22"}
    usage = df[question_by_year[year]]
    return usage.replace({"More than 25 times": "> 25 times",
                          "6-24 times": "6-25 times"})
def get_managed_ML_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about managed ML tools (not asked
    in 2019)."""
    question_by_year = {2022: "Q37", 2021: "Q31_A", 2020: "Q28_A", 2018: "Q28"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_auto_ML_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about AutoML tools (not asked in 2018)."""
    question_by_year = {2022: "Q38", 2021: "Q37_A", 2020: "Q34_A", 2019: "Q33"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_ML_research_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return ML research answers (2022 only), with the long answer texts
    shortened for plot legends."""
    answers = get_multiple_answers_data(df, {2022: "Q10"}, year)
    # Exact full-string relabelling: replace() does not substring-match, so
    # the combined "theoretical -- applied" answer is matched as a whole.
    shorten = {
        "Yes, the research made advances related to some novel machine learning method (theoretical research)": "Yes, theoretical research",
        "Yes, the research made use of machine learning as a tool (applied research)": "Yes, applied research",
        "Yes, the research made advances related to some novel machine learning method (theoretical research) -- Yes, the research made use of machine learning as a tool (applied research)": "Yes, both theoretical<br>and applied research",
    }
    return answers.replace(shorten)
def get_ML_serve_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about ML serving products (2022 only)."""
    return get_multiple_answers_data(df, {2022: "Q39"}, year)
def get_ML_monitor_tools_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about ML monitoring tools (2020-2022)."""
    question_by_year = {2022: "Q40", 2021: "Q38_A", 2020: "Q35_A"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_ethical_AI_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about ethical AI tools (2022 only)."""
    return get_multiple_answers_data(df, {2022: "Q41"}, year)
def get_cloud_computing_tools_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about cloud computing platforms."""
    question_by_year = {2022: "Q31", 2021: "Q27_A", 2020: "Q26_A",
                        2019: "Q29", 2018: "Q15"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_cloud_computing_products_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about cloud computing products."""
    question_by_year = {2022: "Q33", 2021: "Q29_A", 2020: "Q27_A",
                        2019: "Q30", 2018: "Q27"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_data_storage_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about data storage products (2021-2022)."""
    question_by_year = {2022: "Q34", 2021: "Q30_A"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_data_products_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the joined answers about data products used (2020-2022)."""
    question_by_year = {2022: "Q35", 2021: "Q32_A", 2020: "Q29_A"}
    return get_multiple_answers_data(df, question_by_year, year)
def get_spent_money_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the answers about money spent on cloud computing or ML
    (not asked in 2018)."""
    question_by_year = {2022: "Q30", 2021: "Q26", 2020: "Q25", 2019: "Q11"}
    return df[question_by_year[year]]
def get_industry_type_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the industry-type answers (asked in 2018, 2021 and 2022)."""
    question_by_year = {2022: "Q24", 2021: "Q20", 2018: "Q7"}
    return df[question_by_year[year]]
def get_company_size_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the company-size answers with the largest bucket relabelled
    for compact legends (not asked in 2018)."""
    question_by_year = {2022: "Q25", 2021: "Q21", 2020: "Q20", 2019: "Q6"}
    sizes = df[question_by_year[year]]
    return sizes.replace("10,000 or more employees", "> 10,000 employees")
def get_num_data_scientists_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the answers about the number of data scientists employed
    (not asked in 2018)."""
    question_by_year = {2022: "Q26", 2021: "Q22", 2020: "Q21", 2019: "Q7"}
    return df[question_by_year[year]]
def get_ML_usage_in_business_data(df: pd.DataFrame, year: int) -> pd.Series:
    """ Return the answers about how ML is used in the business."""
    question_by_year = {2022: "Q27", 2021: "Q23", 2020: "Q22",
                        2019: "Q8", 2018: "Q10"}
    return df[question_by_year[year]]
def get_df_for_year_and_country(df: pd.DataFrame, year: int, country: str) -> pd.DataFrame:
    """ Returns a dataframe with age, gender, salary and job title information
    extracted from a dataframe.

    Builds one harmonised row per respondent by passing each topic through
    its year-aware getter. Questions a given survey year did not ask are
    filled with None so every year's frame has the same columns and can be
    concatenated downstream.
    """
    new_df = pd.DataFrame()
    # Questions asked (in some form) in every survey year.
    new_df["Age"] = get_age_data(df, year)
    new_df["Gender"] = get_gender_data(df, year)
    new_df["Salary"] = get_salary_data(df, year, country)
    new_df["Title"] = get_title_data(df, year)
    new_df["Formal Education"] = get_formal_education_data(df, year)
    new_df["Learning Platforms"] = get_learning_platforms_data(df, year)
    new_df["Coding Experience"] = get_coding_experience_data(df, year)
    new_df["ML Experience"] = get_ML_experience_data(df, year)
    new_df["Important Activities"] = get_important_activities_data(df, year)
    new_df["Languages"] = get_language_data(df, year)
    new_df["IDEs"] = get_IDE_data(df, year)
    new_df["Notebooks"] = get_notebook_data(df, year)
    new_df["Visualization Tools"] = get_visualization_tools_data(df, year)
    new_df["ML Frameworks"] = get_ML_frameworks_data(df, year)
    new_df["Cloud Computing Tools"] = get_cloud_computing_tools_data(df, year)
    new_df["Cloud Computing Products"] = get_cloud_computing_products_data(df, year)
    new_df["ML Usage in Business"] = get_ML_usage_in_business_data(df, year)
    # Questions missing from the 2018 survey.
    if year == 2018:
        new_df["ML Algorithms"] = [None] * len(new_df["Age"])
        new_df["Computer Vision Methods"] = [None] * len(new_df["Age"])
        new_df["NLP Methods"] = [None] * len(new_df["Age"])
        new_df["ML Hardware"] = [None] * len(new_df["Age"])
        new_df["TPU"] = [None] * len(new_df["Age"])
        new_df["AutoML"] = [None] * len(new_df["Age"])
        new_df["Company Size"] = [None] * len(new_df["Age"])
        new_df["Number of Data Scientists"] = [None] * len(new_df["Age"])
        new_df["Money Spent"] = [None] * len(new_df["Age"])
    else:
        new_df["ML Algorithms"] = get_ML_algorithms_data(df, year)
        new_df["Computer Vision Methods"] = get_computer_vision_methods_data(df, year)
        new_df["NLP Methods"] = get_NLP_methods_data(df, year)
        new_df["ML Hardware"] = get_ML_hardware_data(df, year)
        new_df["TPU"] = get_TPU_data(df, year)
        new_df["AutoML"] = get_auto_ML_data(df, year)
        new_df["Company Size"] = get_company_size_data(df, year)
        new_df["Number of Data Scientists"] = get_num_data_scientists_data(df, year)
        new_df["Money Spent"] = get_spent_money_data(df, year)
    # Managed ML tooling was not asked in 2019.
    if year == 2019:
        new_df["Managed ML"] = [None] * len(new_df["Age"])
    else:
        new_df["Managed ML"] = get_managed_ML_data(df, year)
    # Questions introduced only in the 2022 survey.
    if year == 2022:
        new_df["Published Papers"] = df["Q9"]
        new_df["ML Research"] = get_ML_research_data(df, year)
        new_df["ML Serve"] = get_ML_serve_data(df, year)
        new_df["Ethical AI tools"] = get_ethical_AI_data(df, year)
    else:
        new_df["Published Papers"] = [None] * len(new_df["Age"])
        new_df["ML Research"] = [None] * len(new_df["Age"])
        new_df["ML Serve"] = [None] * len(new_df["Age"])
        new_df["Ethical AI tools"] = [None] * len(new_df["Age"])
    # Industry type was not asked in 2019/2020.
    if year in [2020, 2019]:
        new_df["Industry Type"] = [None] * len(new_df["Age"])
    else:
        new_df["Industry Type"] = get_industry_type_data(df, year)
    # Data storage products were only asked in 2021/2022.
    if year in [2022, 2021]:
        new_df["Data Storage Products"] = get_data_storage_data(df, year)
    else:
        new_df["Data Storage Products"] = [None] * len(new_df["Age"])
    # Data products and ML monitoring were only asked from 2020 on.
    if year in [2019, 2018]:
        new_df["Data Products"] = [None] * len(new_df["Age"])
        new_df["ML Monitor Tools"] = [None] * len(new_df["Age"])
    else:
        new_df["Data Products"] = get_data_products_data(df, year)
        new_df["ML Monitor Tools"] = get_ML_monitor_tools_data(df, year)
    return new_df
def get_data_for_analysis(
        all_years_data: Dict[str, Dict[int, pd.DataFrame]]) -> pd.DataFrame:
    """ Flatten the per-country/per-year survey frames into one analysis
    dataframe, adding "Country" (display name) and "Year" columns.

    Fixes: the original labelled countries through an if/elif chain and left
    the "Country" column entirely unset for any country outside the four it
    listed; unknown countries now keep their survey name. (The original's
    declared return type was also wrong — it always returned a DataFrame.)

    Returns an empty DataFrame when `all_years_data` is empty.
    """
    # Display names used in the plots; unlisted countries keep their name.
    display_name = {"United States of America": "U.S"}
    frames = []
    for country, per_year in all_years_data.items():
        for year, raw_df in per_year.items():
            df = get_df_for_year_and_country(
                df=raw_df, year=year, country=country)
            df["Country"] = [display_name.get(country, country)] * len(df)
            df["Year"] = [year] * len(df)
            frames.append(df)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
"""
Functions to make calculations on the data from analysis_df
(salary averaged over certain groups, get percentage of respondents
that gave a certain answer to a question)
"""
def get_salary_avg_grouped_by_year_country_and_extra_variable(
        analysis_df: pd.DataFrame, extra_var: str) -> pd.DataFrame:
    """ Average salary grouped by (Year, Country, extra_var), keeping only
    groups with at least 15 respondents.

    NOTE(review): the boolean "Keep" helper column survives into the final
    groupby().mean() and comes back as a constant 1.0 column in the result;
    the plots ignore it, but drop it if this output is reused elsewhere.
    """
    avg_data = analysis_df[["Year", "Country", "Salary", extra_var]]
    avg_data = avg_data.dropna(subset=["Year", "Country", "Salary", extra_var])
    # Count respondents per (Year, Country, extra_var) group.
    tmp_df = avg_data.groupby(
        ["Year", "Country", extra_var]).size().reset_index(name="Count")
    tmp_df = tmp_df[tmp_df["Count"] >= 15]  # keep only groups with at least 15 samples
    # Flag each row that belongs to a sufficiently large group, then filter.
    avg_data["Keep"] = False
    for c, year, extra_i in zip(tmp_df["Country"].values, tmp_df["Year"].values, tmp_df[extra_var].values):
        data_filter = (avg_data["Country"]==c) & (avg_data["Year"]==year) & (avg_data[extra_var]==extra_i)
        avg_data.loc[data_filter, "Keep"] = True
    avg_data = avg_data[avg_data["Keep"] == True]
    # Mean of the remaining numeric columns (Salary, and the Keep flag).
    avg_data = avg_data.groupby(
        ["Year", "Country", extra_var], as_index=False).mean()
    return avg_data
def get_percentage_of_multiple_choices_item(
        analysis_df: pd.DataFrame, multiple_choice_column: str, choices: List[str],
        re_expressions: List[str]) -> pd.DataFrame:
    """ Returns a dataframe with the percentage of respondents for each
    option in the choices list by country and year.

    Args:
        analysis_df: must contain "Year", "Country" and multiple_choice_column.
        choices: display labels, parallel to `re_expressions`.
        re_expressions: regexes matched case-sensitively against each
            respondent's joined multiple-choice answers.

    Fixes: the original divided by len(df) without a guard and raised
    ZeroDivisionError whenever a (year, country) pair had no respondents;
    such pairs now report 0.0.
    """
    countries = list(set(analysis_df["Country"].values))
    years = list(set(analysis_df["Year"].values))
    percentage_df_rows = []
    for year in years:
        for country in countries:
            df = analysis_df[(analysis_df["Year"] == year)
                             & (analysis_df["Country"] == country)]
            for re_expression, choice in zip(re_expressions, choices):
                if len(df) == 0:
                    percentage = 0.0
                else:
                    num_respondents = df[multiple_choice_column].str.contains(
                        re_expression, regex=True, na=False, case=True).sum()
                    percentage = round((num_respondents * 100) / len(df), 1)
                percentage_df_rows.append({
                    "Year": year,
                    "Country": country,
                    multiple_choice_column: choice,
                    "Percentage of Respondents": percentage
                })
    return pd.DataFrame(percentage_df_rows)
# Load the survey data and build the single tidy dataframe all plots use.
all_years_data = read_data_from_all_years(years=_YEARS,
                                          filepaths=_FILEPATHS,
                                          countries=_COUNTRIES)
analysis_df = get_data_for_analysis(all_years_data)
# Respondents per country per year.
fig = px.histogram(analysis_df,
    x="Year", color="Country", barmode="group", histfunc="count",
    width=_WIDTH, height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    # Fixed: the original listed flag-emoji labels ("China🇨🇳", ...) here,
    # which do not match the values in the "Country" column ("China", "India",
    # "U.S"), so the requested ordering was silently ignored by plotly.
    category_orders={"Country": ["China", "India", "U.S"]},
    title="Number of tech workers that participated in the data science surveys")
fig.update_xaxes(type='category')
fig.update_yaxes(title="Number of tech workers")
fig.show()
# Gender distribution: stacked, percent-normalised histogram per country,
# animated over the survey years.
fig = px.histogram(analysis_df,
    x="Country", color="Gender", barmode="stack", histfunc="count",
    barnorm="percent", animation_frame="Year",
    width=_WIDTH, height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    category_orders={"Country": ["China", "India", "U.S"],
        "Gender": ["Man", "Woman", "Prefer not to say", "Nonbinary"],
        "Year": range(2018, 2022)},
    title="Gender distribution of people working in tech")
fig.update_xaxes(type='category')
fig.update_yaxes(title="Percentage of respondents (%)")
fig.show()
import plotly
import plotly.io as pio
# NOTE(review): this star import is redundant — everything below is reached
# through the `pio` alias; candidate for removal.
from plotly.io import *
#!pip install -U kaleido
# Average compensation by gender: keep only Man/Woman (the other categories
# have too few respondents for a meaningful mean), then average the salary
# per (Country, Gender, Year) group.
avg_data = analysis_df[analysis_df["Gender"].isin(["Man", "Woman"])]
avg_data = avg_data[["Country", "Gender", "Salary", "Year"]]
avg_data = avg_data.dropna(subset=["Gender", "Country", "Salary", "Year"])
avg_data = avg_data.groupby(["Country", "Gender", "Year"], as_index=False).mean()
# Grouped bars: average yearly compensation (in Big Macs) of men vs women,
# animated over the survey years; the y-range is fixed so animation frames
# are comparable.
fig = px.bar(avg_data,
    x="Country", y="Salary",
    color="Gender",
    animation_frame="Year",
    barmode="group",
    width=_WIDTH, height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    category_orders={"Country": ["China", "India", "U.S"],
        "Gender": ["Man", "Woman"],
        "Year": range(2018, 2022)},
    title="Average yearly compensation of men and women")
fig.update_layout(yaxis_range=[0, 38000])
fig.update_yaxes(title="Average yearly compensation (number of Big Macs)")
fig.update_xaxes(type='category')
fig.show()
# Export an interactive standalone copy of the figure.
pio.write_html(fig, 'avg_compensation.html', include_plotlyjs='cdn')
# Create a video of the graph
#pio.write_image(fig, 'avg_compensation.mp4', width=_WIDTH, height=600, scale=1, engine='kaleido')
# Age-group distribution per country, animated over the years.
# FIX: this cell lost its opening line in the notebook export — the
# `fig = px.histogram(analysis_df,` header is reconstructed here to match
# the sibling stacked-percentage plots, and the trailing line (which
# crammed four statements onto one physical line — a syntax error) is
# split into separate statements.
fig = px.histogram(analysis_df,
    x="Country", color="Age", barmode="stack", histfunc="count",
    barnorm="percent", animation_frame="Year",
    width=_WIDTH, height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    category_orders={"Country": ["China", "India", "U.S"],
        "Age": ["18-21", "22-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-69", "70+"],
        "Year": range(2018, 2022)},
    title="Distribution of People's Age")
fig.update_xaxes(type='category')
fig.update_yaxes(title="Percentage of respondents (%)")
fig.update_layout(legend_title="Age group (years)")
fig.show()
# Bubble chart: mean salary (bubble size) for each country/year pair.
avg_data = (analysis_df[["Year", "Country", "Salary"]]
            .groupby(["Year", "Country"], as_index=False)
            .mean())
fig = px.scatter(
    avg_data,
    x="Year",
    y="Country",
    size="Salary",
    color="Country",
    hover_name="Country",
    size_max=60,
    width=_WIDTH,
    height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    title="Average yearly compensation (in number of Big Macs) per country and year")
fig.update_xaxes(type='category')
fig.show()
# Mean compensation broken down by age group, country and year.
avg_data = get_salary_avg_grouped_by_year_country_and_extra_variable(analysis_df, "Age")
age_order = ["18-21", "22-24", "25-29", "30-34", "35-39", "40-44",
             "45-49", "50-54", "55-59", "60-69", "70+"]
fig = px.bar(
    avg_data,
    x="Country",
    y="Salary",
    color="Age",
    animation_frame="Year",
    barmode="group",
    width=_WIDTH,
    height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    title="Average Compensation of Work Force w.r.t age, country and year",
    category_orders={"Country": ["China", "India", "U.S"],
                     "Age": age_order,
                     "Year": range(2018, 2022)})
fig.update_yaxes(title="Average Compensation")
fig.update_xaxes(type='category')
fig.update_layout(yaxis_range=[0, 39000], legend_title="Age group (years)")
fig.show()
# Mean compensation restricted to respondents aged 18-39.
young = analysis_df[["Year", "Country", "Salary", "Age"]]
young = young[young["Age"].isin(["18-21", "22-24", "25-29", "30-34", "35-39"])]
young = young.dropna(subset=["Year", "Country", "Salary"])
avg_data = (young.drop(columns=["Age"])
            .groupby(["Year", "Country"], as_index=False)
            .mean())
fig = px.bar(
    avg_data,
    x="Year",
    y="Salary",
    color="Country",
    barmode="group",
    width=_WIDTH,
    height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    title="Average compensation of workers between 18 and 39 years of age")
fig.update_yaxes(title="Average Compensation (number of Big Macs)")
fig.update_xaxes(type='category')
fig.update_layout(yaxis_range=[0, 38000])
fig.show()
# Formal-education distribution per country, animated over the years.
# FIX: this cell lost its opening line in the notebook export — the
# `fig = px.histogram(analysis_df,` header is reconstructed here to match
# the other stacked-percentage distribution plots, and the trailing line
# (three statements crammed onto one physical line — a syntax error) is
# split into separate statements.
fig = px.histogram(analysis_df,
    x="Country", color="Formal Education", barmode="stack", histfunc="count",
    barnorm="percent", animation_frame="Year",
    width=_WIDTH, height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    title="Distribution of education of Workforce",
    category_orders={"Country": ["China", "India", "U.S"],
        "Formal Education": ["I prefer not to answer",
            "No formal education past high school",
            "Some college/university study without<br>earning a bachelor's degree",
            "Bachelor's degree", "Master's degree", "Doctoral degree",
            "Professional degree"],
        "Year": range(2018, 2022)})
fig.update_xaxes(type='category')
fig.update_yaxes(title="Percentage of respondents (%)")
fig.show()
# Share of respondents holding any university degree, per country and year.
percentage_df = analysis_df[["Year", "Country", "Formal Education"]]
choices = ["Master's degree", "Doctoral degree",
    "Bachelor's degree", "Professional degree"]
# one word-boundary regex per degree choice
re_expressions = [fr"\b{choice}\b" for choice in choices]
percentage_df = get_percentage_of_multiple_choices_item(
    analysis_df=percentage_df,
    multiple_choice_column="Formal Education",
    choices=choices,
    re_expressions=re_expressions)
# Sum the per-degree percentages into a single "has a degree" figure.
# FIX: numeric_only=True restricts the sum to the percentage column and
# silences the pandas FutureWarning about the changing default of
# numeric_only (the string "Formal Education" column is not needed here).
percentage_df = percentage_df.groupby(
    ["Country", "Year"], as_index=False).sum(numeric_only=True)
fig = px.bar(percentage_df,
    x="Year", y="Percentage of Respondents",
    color="Country",
    barmode="group",
    width=_WIDTH, height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    title="Percentage of respondents with university degree",
    category_orders={"Country": ["China", "India", "U.S"]})
fig.update_layout(yaxis_range=[0, 100])
fig.update_yaxes(title="Percentage of respondents (%)")
fig.update_xaxes(type='category')
fig.show()
# NOTE: pasted notebook output (not code) — kept here as a comment so the
# file parses. Original text:
# C:\Users\vspur\AppData\Local\Temp\ipykernel_15120\3414050516.py:16: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
# Share of respondents whose learning platforms include a programme
# "resulting in a university degree" (question asked from 2019 onwards).
choices = [
    "resulting in a university degree"]
re_expressions = [fr"\b{choices[0]}\b"]
platforms_df = analysis_df[["Year", "Country", "Learning Platforms"]]
platforms_df = platforms_df[platforms_df["Year"].isin([2019, 2020, 2021, 2022])]
percentage_df = get_percentage_of_multiple_choices_item(
    analysis_df=platforms_df,
    multiple_choice_column="Learning Platforms",
    choices=choices,
    re_expressions=re_expressions)
fig = px.bar(
    percentage_df,
    x="Year",
    y="Percentage of Respondents",
    color="Country",
    barmode="group",
    width=_WIDTH,
    height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    title="Percentage of respondents with university degree in Data Science or related field",
    category_orders={"Country": ["China", "India", "U.S"]})
fig.update_yaxes(title="Percentage of respondents (%)")
fig.update_xaxes(type='category')
fig.update_layout(yaxis_range=[0, 35])
fig.show()
# Did respondents (2021 survey only) publish academic papers?
papers_df = analysis_df.loc[analysis_df["Year"] == 2021,
                            ["Country", "Published Papers"]]
fig = px.histogram(
    papers_df,
    x="Country",
    color="Published Papers",
    barmode="group",
    histfunc="count",
    barnorm="percent",
    width=_WIDTH,
    height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    title="Percentage of respondents with university degree that have/have not published papers",
    category_orders={"Country": ["China", "India", "U.S"]})
fig.update_xaxes(type='category')
fig.update_yaxes(title="Percentage of respondents (%)")
fig.update_layout(legend_title="Have they published<br>academic papers?")
fig.show()
# Was the respondents' research (2021 survey) based on machine learning?
research_answers = ["No", "Yes, theoretical research", "Yes, applied research",
                    "Yes, both theoretical<br>and applied research"]
papers_df = analysis_df.loc[analysis_df["Year"] == 2021, ["Country", "ML Research"]]
papers_df = papers_df.dropna(subset=["ML Research"])
papers_df = papers_df[papers_df["ML Research"].isin(research_answers)]
fig = px.histogram(
    papers_df,
    x="Country",
    color="ML Research",
    barmode="stack",
    histfunc="count",
    barnorm="percent",
    width=_WIDTH,
    height=500,
    color_discrete_sequence=_COLOR_SCHEME,
    title="Did their research make use of machine learning?",
    category_orders={"Country": ["China", "India", "U.S"]})
fig.update_xaxes(type='category')
fig.update_yaxes(title="Percentage of respondents (%)")
fig.update_layout(yaxis_range=[0, 100], legend_title="Respondents' answers")
fig.show()
# Coding-experience distribution per country, animated over the years.
experience_orders = {
    "Country": ["China", "India", "U.S"],
    "Coding Experience": ["I have never written code",
                          "< 1 year", "1-5 years",
                          "5-10 years", "10-20 years", "20+ years"],
    "Year": range(2018, 2022),
}
fig = px.histogram(
    analysis_df,
    x="Country",
    color="Coding Experience",
    barmode="stack",
    histfunc="count",
    barnorm="percent",
    animation_frame="Year",
    width=_WIDTH,
    height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    category_orders=experience_orders,
    title="Coding experience groups distribution per country and year")
fig.update_xaxes(type='category')
fig.update_yaxes(title="Percentage of respondents (%)")
fig.show()
# Mean compensation broken down by coding experience, country and year.
avg_data = get_salary_avg_grouped_by_year_country_and_extra_variable(analysis_df, "Coding Experience")
experience_order = ["I have never written code", "< 1 year", "1-5 years",
                    "5-10 years", "10-20 years", "20+ years"]
fig = px.bar(
    avg_data,
    x="Country",
    y="Salary",
    color="Coding Experience",
    animation_frame="Year",
    barmode="group",
    width=_WIDTH,
    height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    title="Average compensation of tech workers by coding experience, country and year",
    category_orders={"Country": ["China", "India", "U.S"],
                     "Coding Experience": experience_order,
                     "Year": range(2018, 2022)})
fig.update_yaxes(title="Average yearly compensation (number of Big Macs)")
fig.update_xaxes(type='category')
fig.update_layout(yaxis_range=[0, 43000])
fig.show()
# Coding experience per job title, 2021 survey only.
# FIX: stale comments/labels said 2022 and listed Japan in the country
# order although the code filters Year == 2021 and keeps China/India/U.S;
# comments, title and category_orders are corrected to match the code.
role_df = analysis_df[analysis_df["Year"] == 2021]
role_df = role_df[["Title", "Country", "Coding Experience"]]
# keep only the three analysed countries
role_df = role_df[role_df["Country"].isin(["China", "India", "U.S"])]
role_df = role_df.dropna(subset=["Title"])
# drop (country, title) groups with fewer than 15 respondents
tmp_df = role_df.groupby(
    ["Country", "Title"]).size().reset_index(name="Count")
tmp_df = tmp_df[tmp_df["Count"] >= 15]  # consider only groups with at least 15 samples
role_df["Keep"] = False
for c, title in zip(tmp_df["Country"].values, tmp_df["Title"].values):
    data_filter = (role_df["Country"] == c) & (role_df["Title"] == title)
    role_df.loc[data_filter, "Keep"] = True
role_df = role_df[role_df["Keep"] == True]
# remove the roles that have no respondents in some countries
role_df = role_df[role_df["Title"] != "Statistician"]
role_df = role_df[role_df["Title"] != "Data Architect"]
role_df = role_df[role_df["Title"] != "Data Administrator"]
title_values = sorted(set(role_df["Title"].values))
fig = px.histogram(role_df,
    x="Title", color="Coding Experience", barmode="stack", histfunc="count",
    barnorm="percent", animation_frame="Country",
    width=_WIDTH, height=800,
    color_discrete_sequence=_COLOR_SCHEME,
    title="Coding experience in the context of job titles in 2021",
    category_orders={"Country": ["China", "India", "U.S"],
        "Coding Experience": ["I have never written code",
            "< 1 year", "1-5 years",
            "5-10 years", "10-20 years", "20+ years"],
        "Title": title_values})
fig.update_xaxes(type='category')
fig.update_yaxes(title="Percentage of respondents (%)")
# move the animation slider and play/pause buttons below the plot
sliders = [dict(y=-0.2)]
updatemenus = [dict(y=-0.2)]
fig.update_layout(sliders=sliders, updatemenus=updatemenus)
fig.show()
# ML-experience distribution (the question exists only in 2019-2021).
ml_years_df = analysis_df[analysis_df["Year"].isin([2021, 2020, 2019])]
ml_exp_order = ["I do not use machine<br>learning methods",
                "< 1 year", "1-2 years", "2-3 years", "3-4 years", "4-5 years",
                "5-10 years", "10-20 years", "20+ years"]
fig = px.histogram(
    ml_years_df,
    x="Country",
    color="ML Experience",
    barmode="stack",
    histfunc="count",
    barnorm="percent",
    animation_frame="Year",
    width=_WIDTH,
    height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    title="Years of experience with ML",
    category_orders={"Country": ["China", "India", "U.S"],
                     "ML Experience": ml_exp_order,
                     "Year": range(2018, 2022)})
fig.update_xaxes(type="category")
fig.update_yaxes(title="Percentage of respondents (%)")
fig.update_layout(legend_title="Experience with ML")
fig.show()
# Mean compensation broken down by ML experience (2019-2021 only).
avg_data = get_salary_avg_grouped_by_year_country_and_extra_variable(
    analysis_df, "ML Experience")
ml_exp_order = ["I do not use machine<br>learning methods",
                "< 1 year", "1-2 years", "2-3 years", "3-4 years", "4-5 years",
                "5-10 years", "10-20 years", "20+ years"]
fig = px.bar(
    avg_data[avg_data["Year"].isin([2021, 2020, 2019])],
    x="Country",
    y="Salary",
    color="ML Experience",
    animation_frame="Year",
    barmode="group",
    width=_WIDTH,
    height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    title="How experience in ML is related to compensation?",
    category_orders={"Country": ["China", "India", "U.S"],
                     "ML Experience": ml_exp_order,
                     "Year": range(2018, 2022)})
fig.update_yaxes(title="Average yearly compensation (number of Big Macs)")
fig.update_xaxes(type='category')
fig.update_layout(yaxis_range=[0, 55000], legend_title="Experience in ML")
fig.show()
def make_percentage_bar_plot(
        percentage_df: pd.DataFrame,
        color_column: str,
        yaxis_range: List[float],
        title: str,
        labels: Dict[str, str] = None,
        legend_title: str = None) -> None:
    """Draw a grouped percentage bar chart animated over the survey years.

    Args:
        percentage_df: frame with "Country", "Year", ``color_column`` and
            "Percentage of Respondents" columns.
        color_column: column used to colour the bars.
        yaxis_range: [min, max] for the y axis.
        title: figure title.
        labels: optional mapping of raw survey answers to display labels;
            its values fix the category order of ``color_column``.
        legend_title: optional legend title override.
    """
    # FIX: years span 2018-2021 inclusive everywhere else in the file;
    # the previous range(2018, 2021) silently left 2021 unordered.
    category_orders = {
        "Country": ["China", "India", "U.S"],
        "Year": list(range(2018, 2022))}
    if labels is not None:
        # FIX: plotly expects a list, not a dict_values view
        category_orders[color_column] = list(labels.values())
    fig = px.bar(percentage_df,
        x="Country", y="Percentage of Respondents",
        color=color_column,
        animation_frame="Year",
        barmode="group",
        width=_WIDTH, height=600,
        color_discrete_sequence=_COLOR_SCHEME,
        title=title,
        category_orders=category_orders)
    fig.update_layout(yaxis_range=yaxis_range)
    if legend_title is not None:
        fig.update_layout(legend_title=legend_title)
    fig.update_yaxes(title="Percentage of respondents (%)")
    fig.update_xaxes(type='category')
    fig.show()
# Work activities that respondents marked as important parts of their role.
choices = [
    "Analyze and understand data to influence product or business decisions",
    "Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data",
    "Build prototypes to explore applying machine learning to new areas",
    "Build and/or run a machine learning service that operationally improves my product or workflows",
    "Experimentation and iteration to improve existing ML models",
    "Do research that advances the state of the art of machine learning",
    "None of these activities are an important part of my role at work",
    "Other"]
# Shorter display labels (with <br> line breaks) for the long answers.
labels = {
    "Analyze and understand data to influence product or business decisions": "Analyze data to influence<br>product or business decisions",
    "Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data": "Build and/or run the data<br>infrastructure that my business<br>uses",
    "Build prototypes to explore applying machine learning to new areas": "Build prototypes to explore<br>applying ML to new areas",
    "Build and/or run a machine learning service that operationally improves my product or workflows": "Build and/or run a ML service<br>that operationally improves<br>my product or workflows",
    "Experimentation and iteration to improve existing ML models": "Experimentation and iteration to<br>improve existing ML models",
    "Do research that advances the state of the art of machine learning": "Research to advance the<br>state of the art of ML",
    "None of these activities are an important part of my role at work": "None of the activities above",
    "Other": "Other"
}
re_expressions = [fr"\b{choice}\b" for choice in choices]
percentage_df = get_percentage_of_multiple_choices_item(
    analysis_df=analysis_df[["Year", "Country", "Important Activities"]],
    multiple_choice_column="Important Activities",
    choices=choices,
    re_expressions=re_expressions)
# swap the raw answers for the short display labels
for raw_answer, short_label in labels.items():
    percentage_df = percentage_df.replace(raw_answer, short_label)
make_percentage_bar_plot(
    percentage_df=percentage_df,
    color_column="Important Activities",
    yaxis_range=[0, 65],
    title="Activities that make up an important part of respondents' role at work",
    labels=labels,
    legend_title="Activities")
# Respondents with coding experience who selected "None" as the language
# they use on a regular basis.
choices = ["None"]
re_expressions = [fr"\b{choice}\b" for choice in choices]
percentage_df = get_percentage_of_multiple_choices_item(
    analysis_df=analysis_df[["Year", "Country", "Languages"]],
    multiple_choice_column="Languages",
    choices=choices,
    re_expressions=re_expressions)
fig = px.bar(
    percentage_df,
    x="Year",
    y="Percentage of Respondents",
    color="Country",
    barmode="group",
    width=_WIDTH,
    height=600,
    color_discrete_sequence=_COLOR_SCHEME,
    title="Percentage of respondents with coding experience that do not code on a regular basis",
    category_orders={"Country": ["China", "India", "U.S"]})
fig.update_yaxes(title="Percentage of respondents (%)")
fig.update_xaxes(type='category')
fig.update_layout(yaxis_range=[0, 3.5])
fig.show()
# Programming languages used on a regular basis.
choices = [
    "Python",
    "R",
    "SQL",
    "C",
    "C++",
    "Java",
    "Javascript",
    "Julia",
    "Bash",
    "MATLAB",
    "None",
    "Other",
    "C#"]
re_expressions = [fr"\b{choice}\b" for choice in choices]
# FIX: the default pattern \bC\b also matches the "C" inside "C++" and
# "C#" ('+' and '#' are non-word characters, so the word boundary holds),
# which inflated the C count — a negative lookahead excludes both.
re_expressions[choices.index("C")] = r"\bC\b(?![+#])"
# '+' is a regex metacharacter and '#' breaks \b, so these two need
# hand-written patterns instead of the \b-wrapped default.
re_expressions[choices.index("C++")] = r"C\+\+"
re_expressions[choices.index("C#")] = r"C#"
percentage_df = get_percentage_of_multiple_choices_item(
    analysis_df=analysis_df[["Year", "Country", "Languages"]],
    multiple_choice_column="Languages",
    choices=choices,
    re_expressions=re_expressions)
# "None" answers are plotted separately above
percentage_df = percentage_df[percentage_df["Languages"] != "None"]
make_percentage_bar_plot(
    percentage_df=percentage_df,
    color_column="Languages",
    yaxis_range=[0, 90],
    title="Programming languages that are used on a regular basis",
    legend_title="Programming languages")
# Hosted notebook products used by the respondents. The surveys renamed
# these products over the years, so both old and new names are matched.
choices = [
    "Kaggle Notebooks",
    "Colab Notebooks",
    "None",
    "Other"]
re_expressions = [fr"\b{choice}\b" for choice in choices]
re_expressions[0] = r"\bKaggle Notebooks\b|\bKaggle Kernels\b"
re_expressions[1] = r"\bColab Notebooks\b|\bGoogle Colab\b"
percentage_df = get_percentage_of_multiple_choices_item(
    analysis_df=analysis_df[["Year", "Country", "Notebooks"]],
    multiple_choice_column="Notebooks",
    choices=choices,
    re_expressions=re_expressions)
make_percentage_bar_plot(
    percentage_df=percentage_df,
    color_column="Notebooks",
    yaxis_range=[0, 50],
    title="Percentage of tech workers that use hosted notebook products")
# Visualization libraries used on a regular basis. Some entries need
# custom regexes because the survey wording varies across years.
choices = [
    "Matplotlib",
    "Seaborn",
    "Plotly / Plotly Express",
    "Ggplot / ggplot2",
    "Shiny",
    "D3 js",
    "Altair",
    "Bokeh",
    "Geoplotlib",
    "Leaflet / Folium",
    # "Pygal",
    # "Dygraphs",
    # "Highcharter",
    "None",
    "Other"]
re_expressions = [fr"\b{choice}\b" for choice in choices]
re_expressions[choices.index("Plotly / Plotly Express")] = r"\bPlotly\b"
re_expressions[choices.index("Ggplot / ggplot2")] = r"\bggplot\b|ggplot2"
re_expressions[choices.index("D3 js")] = r"D3"
re_expressions[choices.index("Leaflet / Folium")] = r"\bLeaflet\b|\bFolium\b"
percentage_df = get_percentage_of_multiple_choices_item(
    analysis_df=analysis_df[["Year", "Country", "Visualization Tools"]],
    multiple_choice_column="Visualization Tools",
    choices=choices,
    re_expressions=re_expressions)
make_percentage_bar_plot(
    percentage_df=percentage_df,
    color_column="Visualization Tools",
    yaxis_range=[0, 75],
    title="Visualization libraries used on a regular basis by tech workers with coding experience")